# import all packages and set plots to be embedded inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
%matplotlib inline
# Read the Ford GoBike trip export, then preview its shape, its column
# dtypes / non-null counts, and its first five rows.
bike = pd.read_csv('201902-fordgobike-tripdata.csv')
print(bike.shape, end='\n\n')
bike.info()
print()
bike.head()
(183412, 16) <class 'pandas.core.frame.DataFrame'> RangeIndex: 183412 entries, 0 to 183411 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 duration_sec 183412 non-null int64 1 start_time 183412 non-null object 2 end_time 183412 non-null object 3 start_station_id 183215 non-null float64 4 start_station_name 183215 non-null object 5 start_station_latitude 183412 non-null float64 6 start_station_longitude 183412 non-null float64 7 end_station_id 183215 non-null float64 8 end_station_name 183215 non-null object 9 end_station_latitude 183412 non-null float64 10 end_station_longitude 183412 non-null float64 11 bike_id 183412 non-null int64 12 user_type 183412 non-null object 13 member_birth_year 175147 non-null float64 14 member_gender 175147 non-null object 15 bike_share_for_all_trip 183412 non-null object dtypes: float64(7), int64(2), object(7) memory usage: 22.4+ MB
| duration_sec | start_time | end_time | start_station_id | start_station_name | start_station_latitude | start_station_longitude | end_station_id | end_station_name | end_station_latitude | end_station_longitude | bike_id | user_type | member_birth_year | member_gender | bike_share_for_all_trip | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 52185 | 2019-02-28 17:32:10.1450 | 2019-03-01 08:01:55.9750 | 21.0 | Montgomery St BART Station (Market St at 2nd St) | 37.789625 | -122.400811 | 13.0 | Commercial St at Montgomery St | 37.794231 | -122.402923 | 4902 | Customer | 1984.0 | Male | No |
| 1 | 42521 | 2019-02-28 18:53:21.7890 | 2019-03-01 06:42:03.0560 | 23.0 | The Embarcadero at Steuart St | 37.791464 | -122.391034 | 81.0 | Berry St at 4th St | 37.775880 | -122.393170 | 2535 | Customer | NaN | NaN | No |
| 2 | 61854 | 2019-02-28 12:13:13.2180 | 2019-03-01 05:24:08.1460 | 86.0 | Market St at Dolores St | 37.769305 | -122.426826 | 3.0 | Powell St BART Station (Market St at 4th St) | 37.786375 | -122.404904 | 5905 | Customer | 1972.0 | Male | No |
| 3 | 36490 | 2019-02-28 17:54:26.0100 | 2019-03-01 04:02:36.8420 | 375.0 | Grove St at Masonic Ave | 37.774836 | -122.446546 | 70.0 | Central Ave at Fell St | 37.773311 | -122.444293 | 6638 | Subscriber | 1989.0 | Other | No |
| 4 | 1585 | 2019-02-28 23:54:18.5490 | 2019-03-01 00:20:44.0740 | 7.0 | Frank H Ogawa Plaza | 37.804562 | -122.271738 | 222.0 | 10th Ave at E 15th St | 37.792714 | -122.248780 | 4898 | Subscriber | 1974.0 | Male | Yes |
# List the column labels of the raw trip table.
bike.columns
Index(['duration_sec', 'start_time', 'end_time', 'start_station_id',
'start_station_name', 'start_station_latitude',
'start_station_longitude', 'end_station_id', 'end_station_name',
'end_station_latitude', 'end_station_longitude', 'bike_id', 'user_type',
'member_birth_year', 'member_gender', 'bike_share_for_all_trip'],
dtype='object')
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
bike.describe()
| duration_sec | start_station_id | start_station_latitude | start_station_longitude | end_station_id | end_station_latitude | end_station_longitude | bike_id | member_birth_year | |
|---|---|---|---|---|---|---|---|---|---|
| count | 183412.000000 | 183215.000000 | 183412.000000 | 183412.000000 | 183215.000000 | 183412.000000 | 183412.000000 | 183412.000000 | 175147.000000 |
| mean | 726.078435 | 138.590427 | 37.771223 | -122.352664 | 136.249123 | 37.771427 | -122.352250 | 4472.906375 | 1984.806437 |
| std | 1794.389780 | 111.778864 | 0.099581 | 0.117097 | 111.515131 | 0.099490 | 0.116673 | 1664.383394 | 10.116689 |
| min | 61.000000 | 3.000000 | 37.317298 | -122.453704 | 3.000000 | 37.317298 | -122.453704 | 11.000000 | 1878.000000 |
| 25% | 325.000000 | 47.000000 | 37.770083 | -122.412408 | 44.000000 | 37.770407 | -122.411726 | 3777.000000 | 1980.000000 |
| 50% | 514.000000 | 104.000000 | 37.780760 | -122.398285 | 100.000000 | 37.781010 | -122.398279 | 4958.000000 | 1987.000000 |
| 75% | 796.000000 | 239.000000 | 37.797280 | -122.286533 | 235.000000 | 37.797320 | -122.288045 | 5502.000000 | 1992.000000 |
| max | 85444.000000 | 398.000000 | 37.880222 | -121.874119 | 398.000000 | 37.880222 | -121.874119 | 6645.000000 | 2001.000000 |
# Number of missing values in each column of the raw table.
bike.isna().sum()
duration_sec 0 start_time 0 end_time 0 start_station_id 197 start_station_name 197 start_station_latitude 0 start_station_longitude 0 end_station_id 197 end_station_name 197 end_station_latitude 0 end_station_longitude 0 bike_id 0 user_type 0 member_birth_year 8265 member_gender 8265 bike_share_for_all_trip 0 dtype: int64
# Fill the missing rider demographics with the most frequent value of each
# column. The filled Series is assigned back to the column instead of calling
# ``fillna(..., inplace=True)`` on the selected Series: that form is chained
# assignment, which pandas warns about and which no longer updates ``bike``
# when copy-on-write is active.
bike['member_birth_year'] = bike['member_birth_year'].fillna(bike['member_birth_year'].mode()[0])
bike['member_gender'] = bike['member_gender'].fillna(bike['member_gender'].mode()[0])

# Drop the rows that still contain a null in any remaining column.
bike.dropna(inplace=True)

# Confirm that no missing values are left.
bike.isnull().sum()
duration_sec 0 start_time 0 end_time 0 start_station_id 0 start_station_name 0 start_station_latitude 0 start_station_longitude 0 end_station_id 0 end_station_name 0 end_station_latitude 0 end_station_longitude 0 bike_id 0 user_type 0 member_birth_year 0 member_gender 0 bike_share_for_all_trip 0 dtype: int64
# Parse the trip timestamps. The raw strings carry fractional seconds
# (e.g. "2019-02-28 17:32:10.1450"), so the format must include ``.%f``;
# without it the format does not describe the whole string, which strict
# format parsing in newer pandas releases rejects.
TIMESTAMP_FORMAT = '%Y-%m-%d %H:%M:%S.%f'
bike['start_time'] = pd.to_datetime(bike['start_time'], format=TIMESTAMP_FORMAT)
bike['end_time'] = pd.to_datetime(bike['end_time'], format=TIMESTAMP_FORMAT)

# Birth years are whole numbers; the column was float only because it held NaNs.
bike['member_birth_year'] = bike['member_birth_year'].astype(int)
bike.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 183215 entries, 0 to 183411 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 duration_sec 183215 non-null int64 1 start_time 183215 non-null datetime64[ns] 2 end_time 183215 non-null datetime64[ns] 3 start_station_id 183215 non-null float64 4 start_station_name 183215 non-null object 5 start_station_latitude 183215 non-null float64 6 start_station_longitude 183215 non-null float64 7 end_station_id 183215 non-null float64 8 end_station_name 183215 non-null object 9 end_station_latitude 183215 non-null float64 10 end_station_longitude 183215 non-null float64 11 bike_id 183215 non-null int64 12 user_type 183215 non-null object 13 member_birth_year 183215 non-null int32 14 member_gender 183215 non-null object 15 bike_share_for_all_trip 183215 non-null object dtypes: datetime64[ns](2), float64(6), int32(1), int64(2), object(5) memory usage: 23.1+ MB
# Work on a copy so the derived columns are not added to ``bike`` itself.
bike_clean = bike.copy()

# Split both timestamps into calendar parts: day of month, month, year,
# weekday number and weekday name. The start columns are created first and
# the end columns second.
for prefix in ('start', 'end'):
    stamp = bike_clean[f'{prefix}_time'].dt
    bike_clean[f'{prefix}_day'] = stamp.day
    bike_clean[f'{prefix}_month'] = stamp.month
    bike_clean[f'{prefix}_year'] = stamp.year
    bike_clean[f'{prefix}_weekday'] = stamp.weekday
    bike_clean[f'{prefix}_weekday_name'] = stamp.day_name()

# Trip length converted from seconds to minutes, rounded to a whole number.
bike_clean['duration_min'] = (bike_clean['duration_sec'] / 60).round()
bike_clean['duration_min']
0 870.0
1 709.0
2 1031.0
3 608.0
4 26.0
...
183407 8.0
183408 5.0
183409 2.0
183410 2.0
183411 5.0
Name: duration_min, Length: 183215, dtype: float64
# Rider age, taken as the difference between 2019 and the birth year.
reference_year = 2019
bike_clean['member_age'] = reference_year - bike_clean['member_birth_year']
bike_clean['member_age']
0 35
1 31
2 47
3 30
4 45
..
183407 23
183408 35
183409 29
183410 31
183411 30
Name: member_age, Length: 183215, dtype: int32
# Column labels after feature engineering: the original columns plus the derived ones.
bike_clean.columns
Index(['duration_sec', 'start_time', 'end_time', 'start_station_id',
'start_station_name', 'start_station_latitude',
'start_station_longitude', 'end_station_id', 'end_station_name',
'end_station_latitude', 'end_station_longitude', 'bike_id', 'user_type',
'member_birth_year', 'member_gender', 'bike_share_for_all_trip',
'start_day', 'start_month', 'start_year', 'start_weekday',
'start_weekday_name', 'end_day', 'end_month', 'end_year', 'end_weekday',
'end_weekday_name', 'duration_min', 'member_age'],
dtype='object')
The dataset originally has 183412 rows and 16 columns, which are 'duration_sec', 'start_time', 'end_time', 'start_station_id', 'start_station_name', 'start_station_latitude', 'start_station_longitude', 'end_station_id', 'end_station_name', 'end_station_latitude', 'end_station_longitude', 'bike_id', 'user_type', 'member_birth_year', 'member_gender' and 'bike_share_for_all_trip'. After wrangling the dataset, we have 183215 rows and 28 columns, which are 'duration_sec', 'start_time', 'end_time', 'start_station_id', 'start_station_name', 'start_station_latitude', 'start_station_longitude', 'end_station_id', 'end_station_name', 'end_station_latitude', 'end_station_longitude', 'bike_id', 'user_type', 'member_birth_year', 'member_gender', 'bike_share_for_all_trip', 'start_day', 'start_month', 'start_year', 'start_weekday', 'start_weekday_name', 'end_day', 'end_month', 'end_year', 'end_weekday', 'end_weekday_name', 'duration_min' and 'member_age'.
I will be looking for insights into how these columns affect the bike-share system.
I think weekday name, member age and user type should really help me in my investigation.
I start by looking at the gender
Count per gender
# Horizontal bar chart of trip counts per gender, ordered from the most
# frequent gender to the least frequent one.
base_color = sns.color_palette()[0]
gender = bike_clean['member_gender'].value_counts().index
plt.title("Count per gender ")
sns.countplot(y='member_gender', data=bike_clean, order=gender, color=base_color);
Male riders took the most trips, with a count of 138763, and the "Other" category (i.e. gender not identified) took the fewest.
user type per count
# Pie chart of the share of trips made by each user type.
count = bike_clean['user_type'].value_counts()
# Take the wedge labels from the counts themselves so every label is always
# paired with its own value, instead of relying on a hand-typed order that
# merely happens to match the sort order of value_counts().
label = list(count.index)
colors = ["#8c564b", "#ff7f0e"]
fig = plt.figure(figsize=(10, 7))
plt.pie(count, labels=label, autopct='%1.1f%%', colors=colors)
plt.title("User Type ")
# show plot
plt.show()
Subscribers account for 89.2% of trips and Customers for 10.8%, so most people who ride Ford GoBikes are subscribers.
Checking for the percentage of people who do share trips and those who do not
# Pie chart of the share of trips per value of bike_share_for_all_trip.
count = bike_clean['bike_share_for_all_trip'].value_counts()
# Take the wedge labels from the counts themselves so every label is always
# paired with its own value, instead of relying on a hand-typed order that
# merely happens to match the sort order of value_counts().
label = list(count.index)
colors = ["#8c564b", "#ff7f0e"]
fig = plt.figure(figsize=(10, 7))
plt.pie(count, labels=label, autopct='%1.1f%%', colors=colors)
plt.title("bike share for all trip ")
# show plot
plt.show()
Here we can see that far more people do not share rides (90.5%) than do share rides (just 9.5%).
Duration of rides per seconds
# Histogram of trip duration in seconds, with the x-axis limited to 0-4000 s.
# ``sns.histplot`` is axes-level, so it draws into the figure created on the
# next line. The figure-level ``sns.displot`` used before opened a figure of
# its own and left this one empty ("Figure size 600x400 with 0 Axes").
plt.figure(figsize=(6, 4), dpi=100)
sns.histplot(data=bike_clean, x="duration_sec")
plt.xlim(0, 4000)
plt.title("Trip per seconds")
plt.xlabel('Duration in seconds')
plt.ylabel('Frequency')
plt.show();
<Figure size 600x400 with 0 Axes>
# Interactive histogram of trip duration in minutes.
fig = px.histogram(bike_clean, x='duration_min', title='Rides per minutes')
fig.show()

# Average ride length in minutes. Series.mean() does the reduction inside
# pandas rather than iterating over every row with the builtin sum().
average = bike_clean['duration_min'].mean()
round(average, 0)
12.0
Short durations account for the most rides: the most common ride length is just 6 minutes, and the average ride lasts 12 minutes.
Age range for each rides
# Interactive histogram of rider ages.
fig = px.histogram(data_frame=bike_clean, x='member_age', title='Age per rides')
fig.show()
Looking further at the ages of the people who ride Ford GoBikes the most, we can see that most riders are between 22 and 40 years old, which is mostly the working-age range.
Weekday Per Rides
# Trip counts per weekday: start day on the upper panel, end day on the lower.
weekdays = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
fig, ax = plt.subplots(nrows=2, figsize=[8, 8])
default_color = sns.color_palette()[0]
sns.countplot(data=bike_clean, x='start_weekday_name', color=default_color, ax=ax[0], order=weekdays).set(title='Rides per weekdays')
# Target the lower panel explicitly; without ``ax=`` the plot only landed there
# because it happened to be the current axes.
sns.countplot(data=bike_clean, x='end_weekday_name', color=default_color, ax=ax[1], order=weekdays)
plt.show()
Here we can see that most rides occur on weekdays, mostly on Thursdays, rather than on weekends; this is probably because they are workdays.
Trip per hours
# Derive the clock time and the two-digit hour string of every trip start.
start_stamp = bike_clean['start_time'].dt
bike_clean['tim'] = start_stamp.time
bike_clean['hh'] = start_stamp.strftime('%H')

# Horizontal bar chart of trip counts per start hour, ordered from the
# busiest hour to the quietest.
plt.figure(figsize=[15, 10]);
base_color = sns.color_palette()[0]
hour = bike_clean['hh'].value_counts().index
plt.title("Count per hour ")
sns.countplot(y='hh', data=bike_clean, order=hour, color=base_color);
Most trips occur around 5pm and 8am, and it is very likely that this is due to rush hours, i.e. the time most people leave work and the time most people leave for work.
Top 10 Stations
# Ten most frequent start stations (upper panel) and end stations (lower panel).
fig, ax = plt.subplots(nrows=2, figsize=[8, 8])
bike_clean['start_station_name'].value_counts().head(10).plot(kind='barh', ax=ax[0], title='Top 10 start_station')
# Target the lower panel explicitly; without ``ax=`` the plot only landed there
# because it happened to be the current axes.
bike_clean['end_station_name'].value_counts().head(10).plot(kind='barh', ax=ax[1], title='Top 10 end_station')
# Re-fit the panels so the station-name tick labels and the titles do not collide.
fig.tight_layout();
Noticed we have similar stations for the top 10 start and end stations
The male gender has the most records, most of the people who take trips are subscribers, the age that takes the most rides is 31, and most trips last just 6 minutes.
No there isn't any unusual distribution in the dataset.
Gender per user type
# Trip counts per gender, with one bar per user type.
gender_user_ax = sns.countplot(x='member_gender', hue='user_type', data=bike_clean)
gender_user_ax.set_title('gender per user types');
More male than female riders are subscribers.
Gender per bike share
# Trip counts per gender, with one bar per value of bike_share_for_all_trip.
gender_share_ax = sns.countplot(x='member_gender', hue='bike_share_for_all_trip', data=bike_clean)
gender_share_ax.set_title('gender per bike share');
Most males do not share trips
Trip duration per days
# Trip duration (minutes) against start weekday. seaborn's lineplot aggregates
# the many trips that share a weekday into one point per day (the mean, by
# default), so the title describes a duration rather than a count.
base_color = sns.color_palette()[0]
plt.title("Average trip duration per weekday ")
sns.lineplot(data=bike_clean, x='start_weekday_name', y='duration_min', color=base_color);
plt.xticks(rotation=90);
Trips are shorter on weekdays than on weekends.
In all, more male than female riders are subscribers, most males do not share trips, and trips are shorter on weekdays than on weekends.
# Bar chart of ride length in minutes per gender, with one bar per user type
# (seaborn's barplot shows the mean of each group by default).
ax = sns.barplot(x='member_gender', y='duration_min', hue='user_type', data=bike_clean)
# loc=8 is matplotlib's numeric code for the "lower center" legend position.
ax.legend(title='user type', loc=8, ncol=3, framealpha=1);
-- Customers spend more time on rides than subscribers
-- Male and female riders spend, on average, about the same time on rides
# Point plot of ride length in minutes per gender, split by the
# bike_share_for_all_trip flag. The markers are dodged so the two groups do
# not overlap, and the empty linestyle leaves them unconnected.
# Keep the Axes in ``ax``: chaining ``.set(...)`` onto the call bound ``ax`` to
# the return value of ``set`` instead of the plot.
ax = sns.pointplot(data=bike_clean, x='member_gender', y='duration_min', hue='bike_share_for_all_trip', dodge=0.3, linestyles="")
ax.set(title="Relationship between gender, duration in minutes and bike_share_for_all_trip ");
Male riders spend less time on rides and are also the group with the fewest riders who share trips.
# Scatter of ride length (minutes) against rider age; gender sets both the
# colour and the marker shape.
plt.figure(figsize=[15, 10]);
age_duration_ax = sns.scatterplot(x="member_age", y="duration_min", hue="member_gender", style="member_gender", data=bike_clean)
age_duration_ax.set_title("Relationship between member age, duration in minutes and gender");
Younger males tend to spend more time on rides than older males.
# Scatter of ride length (minutes) against rider age; user type sets the
# colour and gender sets the marker shape. The title names both grouping
# variables (the copy-pasted one mentioned only gender).
plt.figure(figsize=[15, 10]);
sns.scatterplot(data=bike_clean, x="member_age", y="duration_min", hue="user_type", style="member_gender").set(title="Relationship between member age, duration in minutes, user type and gender");
We can see that Customers spend more time on rides than subscribers, and also that most people whose gender is not identified spend more time than those whose gender is identified. So we can say the time spent on rides doesn't really depend on whether the person is male or female, since they spend about equal minutes on rides.
Male riders are the group with the fewest riders who share trips.
After the wrangling and visualizations, we can infer that:
-- We have more males than females
-- There are more subscribers than customers
-- Most trips are completed within a short time.
-- We have more trips on weekdays than on weekends
-- Most of the trips are made around 5pm and 8am, and we deduce this is due to rush hours.
-- More younger people than older people ride the bikes